Each preference measure is standardized at the individual level, so that, by construction, each preference has a mean of zero and a standard deviation of one in the individual-level world sample.
#Load packages
library(data.table)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
data.table 1.14.8 using 1 threads (see ?getDTthreads). Latest news: r-datatable.com
**********
This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
**********
library(tidyr)
library(haven)
library(ggplot2)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:data.table’:
between, first, last
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(gridExtra)
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
library(lmtest)
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
library(maps)
library(mapdata)
#Add data GPS
gps_data <- haven::read_dta("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/GPS_dataset_individual_level/individual_new.dta")
head(country_data)
# Determine the number of different countries
number_of_countries <- length(unique(gps_data$country))
# Display the number of different countries
number_of_countries
[1] 76
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
drop_na(country, isocode, risktaking, gender, age)
# Calculate the number of records removed per variable
records_removed_per_variable <- colSums(is.na(gps_data)) - colSums(is.na(cleaned_data))
# Display the cleaned data
cleaned_data
# Display the number of records removed per variable
records_removed_per_variable
country isocode ison region language date
0 0 0 0 0 0
id_gallup wgt patience risktaking posrecip negrecip
0 0 190 634 32 247
altruism trust subj_math_skills gender age
74 163 132 0 276
# List all variables
variable_list <- names(cleaned_data) # OR colnames(your_data)
# Display the list of variables
print(variable_list)
[1] "country" "isocode" "ison" "region" "language"
[6] "date" "id_gallup" "wgt" "patience" "risktaking"
[11] "posrecip" "negrecip" "altruism" "trust" "subj_math_skills"
[16] "gender" "age"
#select only the variables of interest
cleaned_data <- cleaned_data %>%
select(country, isocode, ison, date, risktaking, gender, age)
cleaned_data
cleaned_data$agecat <- cut(
cleaned_data$age,
breaks = c(15, 20, 30, 40, 50, 60, 70, 80, Inf), # The category boundaries
labels = c("15-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"), # The category labels
right = FALSE # Left end (inclusive), right end (exclusive)
)
# Display the new column
head(cleaned_data)
# Determine the number of different countries
number_of_countries <- length(unique(cleaned_data$country))
# Display the number of different countries
number_of_countries
[1] 76
world_map <- map_data("world") # Create a world map with country borders
recorded_countries <- unique(cleaned_data$country) # Get the list of recorded countries from your cleaned_data
world_map$recorded <- ifelse(world_map$region %in% recorded_countries, "Recorded", "Not Recorded") # Create a new variable indicating whether a country has been recorded or not
# Plot the world map with recorded countries highlighted
ggplot(world_map, aes(x = long, y = lat, group = group, fill = recorded)) +
geom_polygon(color = "white") +
scale_fill_manual(values = c("Recorded" = "darkblue", "Not Recorded" = "lightgrey"), guide = "legend") +
theme_void() +
labs(title = "Recorded Countries on World Map", fill = "Status") +
theme(legend.position = "bottom")
# Age Range
age_min <- min(cleaned_data$age, na.rm = TRUE)
age_max <- max(cleaned_data$age, na.rm = TRUE)
# Average Age
average_age <- mean(cleaned_data$age, na.rm = TRUE)
# Median Age
median_age <- median(cleaned_data$age, na.rm = TRUE)
# Display the age statistics
cat("Age Range: ", age_min, " to ", age_max, "\n")
Age Range: 15 to 99
cat("Average Age: ", average_age, "\n")
Average Age: 41.73605
cat("Median Age: ", median_age, "\n")
Median Age: 40
# number of participants in each age category
agecat_counts <- table(cleaned_data$agecat)
# Display the number of participants in the age categories
print(agecat_counts)
15-19 20-29 30-39 40-49 50-59 60-69 70-79 80+
6888 16872 15905 13583 11374 8570 4688 1559
ggplot(cleaned_data, aes(x = age)) +
geom_histogram(binwidth = 0.5, fill = "lightblue", color = "blue") +
labs(x = "Age", y = "Frequency", title = "Histogram of Age Distributionn") +
theme_minimal()
ggplot(cleaned_data, aes(x = country, y = age)) +
geom_boxplot(fill = "lightblue", color = "blue") +
labs(title = "Distribution of Age per Country", x = "Country", y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Calculate the counts of females (gender = 1) and males (gender = 2)
gender_counts <- table(cleaned_data$gender)
# Display the counts
cat("Number of Females: ", gender_counts[1], "\n")
Number of Females: 36024
cat("Number of Males: ", gender_counts[2], "\n")
Number of Males: 43415
# Create a table that breaks down the number of participants by country and gender
gender_by_country <- xtabs(~ country + gender, data = cleaned_data)
# Rename columns and rows for better readability
colnames(gender_by_country) <- c("Female", "Male")
rownames(gender_by_country) <- unique(cleaned_data$country)
# Calculate the total participants by summing the rows
total_participants <- rowSums(gender_by_country)
# Create a data frame with country, total participants, female, and male
result_table <- data.frame(
country = rownames(gender_by_country),
total_participants = total_participants,
female = gender_by_country[, "Female"],
male = gender_by_country[, "Male"]
)
# Display the result table
result_table
# Summary statistics for risktaking
risk_summary <- summary(cleaned_data$risktaking)
# Histogram of risktaking
ggplot(cleaned_data, aes(x = risktaking)) +
geom_histogram(binwidth = 1, fill = "lightblue", color = "blue", alpha = 0.7) +
labs(title = "Distribution of Risk-Taking", x = "Risk-Taking Score", y = "Frequency") +
theme_minimal()
# Boxplot of risktaking by gender
ggplot(cleaned_data, aes(x = as.factor(gender), y = risktaking, fill = as.factor(gender))) +
geom_boxplot() +
labs(title = "Risk-Taking by Gender", x = "Gender", y = "Risk-Taking Score") +
scale_x_discrete(labels = c("0" = "male", "1" = "female")) +
theme_minimal() +
guides(fill = FALSE)
# Risk vs age
ggplot(cleaned_data, aes(risktaking, age)) +
geom_point(size = 0.2) +
geom_smooth(method = "lm")
# Boxplot of risktaking by age group
cleaned_data$age_group <- cut(cleaned_data$age, breaks = c(0, 25, 35, 45, 55, Inf),
labels = c("18-25", "26-35", "36-45", "46-55", "56+"))
ggplot(cleaned_data, aes(x = age_group, y = risktaking, fill = age_group)) +
geom_boxplot() +
labs(title = "Risk-Taking by Age Group", x = "Age Group", y = "Risk-Taking Score") +
theme_minimal() +
guides(fill = FALSE)
# Boxplot of risktaking by country
ggplot(cleaned_data, aes(x = country, y = risktaking)) +
geom_boxplot(fill = "lightblue", color = "blue") +
labs(title = "Risk-Taking by Country", x = "Country", y = "Risk-Taking Score") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Fit das Modell mit "age"
model_gender <- lm(risktaking ~ age, data = cleaned_data)
# Intercept and Slope for age
intercept_age <- coef(model)["(Intercept)"]
slope_age <- coef(model)["age"]
summary(model)
Call:
lm(formula = risktaking ~ age, data = cleaned_data)
Residuals:
Min 1Q Median 3Q Max
-2.24386 -0.68385 -0.04386 0.65311 3.05477
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.5759701 0.0089312 64.49 <2e-16 ***
age -0.0137902 0.0001974 -69.84 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9707 on 79437 degrees of freedom
Multiple R-squared: 0.05786, Adjusted R-squared: 0.05784
F-statistic: 4878 on 1 and 79437 DF, p-value: < 2.2e-16
# Fit das Modell mit "gender"
model_gender <- lm(risktaking ~ gender, data = cleaned_data)
# Intercept und Slope für gender
intercept_gender <- coef(model_gender)["(Intercept)"]
slope_gender <- coef(model_gender)["gender"]
summary(model_gender)
Call:
lm(formula = risktaking ~ gender, data = cleaned_data)
Residuals:
Min 1Q Median 3Q Max
-1.9994 -0.7148 -0.0128 0.6792 2.5688
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.124666 0.005235 23.81 <2e-16 ***
gender -0.227338 0.007081 -32.10 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9936 on 79437 degrees of freedom
Multiple R-squared: 0.01281, Adjusted R-squared: 0.0128
F-statistic: 1031 on 1 and 79437 DF, p-value: < 2.2e-16
# Group the data by country
table_data <- gps_data %>%
group_by(country, isocode) %>%
summarize(
n = n(),
female_percentage = mean(gender == 1) * 100,
mean_age = mean(age, na.rm = TRUE),
age_range = paste(min(age, na.rm = TRUE), "-", max(age, na.rm = TRUE)),
mean_risktaking = mean(risktaking, na.rm = TRUE)
)
`summarise()` has grouped output by 'country'. You can override using the `.groups` argument.
# Intercept and slope for age
intercept_age <- 0.5759701
slope_age <- -0.0137902
# Intercept and slope for gender
intercept_gender <- 0.124666
slope_gender <- -0.227338
# Add the intercept and slope information to the table
table_data <- table_data %>%
mutate(
intercept_age = intercept_age,
slope_age = slope_age,
intercept_gender = intercept_gender,
slope_gender = slope_gender,
)
# Display the updated table
table_data
NA
# Scatterplot to check for linearity
ggplot(cleaned_data, aes(x = age, y = risktaking)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Scatterplot for Linearity Check")
# Residual vs. Fitted plot to check for homoskedasticity
model <- lm(risktaking ~ age, data = cleaned_data)
ggplot() +
geom_point(aes(x = fitted(model), y = residuals(model))) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residuals vs. Fitted Values for Homoskedasticity Check")
# Q-Q plot to check for normality of residuals
qqnorm(residuals(model))
qqline(residuals(model))
# Correlation matrix to check for multicollinearity
cor(cleaned_data[c("age", "risktaking")])
age risktaking
age 1.0000000 -0.2405333
risktaking -0.2405333 1.0000000
# Durbin-Watson test for autocorrelation
dwtest(model)
Durbin-Watson test
data: model
DW = 1.5415, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0
# Residual vs. Leverage plot to check for outliers
ggplot() +
geom_point(aes(x = hatvalues(model), y = residuals(model))) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residuals vs. Leverage for Outlier Check")
#Preparing for linear regression
# Check for missing values in 'Country' and 'Risktaking' columns
missing_country <- anyNA(cleaned_data$country)
missing_risktaking <- anyNA(cleaned_data$risktaking)
# Print the results
cat("Missing values in 'Country': ", missing_country, "\n")
Missing values in 'Country': FALSE
cat("Missing values in 'Risktaking': ", missing_risktaking, "\n")
Missing values in 'Risktaking': FALSE
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
drop_na(country, risktaking, age)
# Split the data by country and perform linear regression for each country
regression_results <- cleaned_data %>%
group_by(country) %>%
do(model = lm(risktaking ~ age, data = .)) %>%
summarize(
country = first(country),
intercept = summary(model)$coefficients[1],
slope = summary(model)$coefficients[2],
r_squared = summary(model)$r.squared
)
# Display regression results for each country
print(regression_results)
ggplot(data = regression_results, aes(x = country, y = intercept)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Intercepts for Risktaking by Country", x = "Country", y = "Intercept Value") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
high_intercept_countries <- subset(regression_results, intercept > 0.75)
# View the countries with intercept values over 0.75
print(high_intercept_countries)
# Create scatterplots with regression lines for countries with intercept > 0.75 and smaller points
plots <- lapply(1:nrow(regression_results), function(i) {
country <- regression_results$country[i]
intercept <- regression_results$intercept[i]
if (intercept > 0.75) {
p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking)) +
geom_point(size = 0.5) + # Set the size to a smaller value (e.g., 2)
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
labs(title = paste("Linear Regression for", country),
subtitle = paste("Intercept:", round(intercept, 2)))
print(p)
}
})
# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")
Warning: The working directory was changed to /Users/laurabazzigher/Documents/GitHub/risk_wvs/code/individual_country_plots inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
for (i in seq_along(plots)) {
filename <- paste0("plot_", i, ".png")
ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}
# Switch back to the original working directory
setwd("..")
print("Individual plots for countries with intercept > 0.75 and smaller points are saved in the 'individual_country_plots' directory.")
[1] "Individual plots for countries with intercept > 0.75 and smaller points are saved in the 'individual_country_plots' directory."
# Examples for countries
regression_results <- data.frame(
country = c("Algeria", "Argentina", "Austria"),
intercept = c(0.92053422, 0.51698822, 0.42606684),
slope = c(-0.0146641801, -0.0115569623, -0.0108763042),
r_squared = c(5.232529e-02, 5.638271e-02, 3.539810e-02 )
)
# Create scatterplots with regression lines for each country
plots <- lapply(seq_len(nrow(regression_results)), function(i) {
country <- regression_results$country[i]
intercept <- regression_results$intercept[i]
slope <- regression_results$slope[i]
r_squared <- regression_results$r_squared[i]
p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
labs(title = paste("Linear Regression for", country),
subtitle = paste("Intercept:", round(intercept, 2),
"Slope:", round(slope, 2),
"R-squared:", round(r_squared, 2)))
print(p)
})
# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")
Warning: The working directory was changed to /Users/laurabazzigher/Documents/GitHub/risk_wvs/code/individual_country_plots inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
for (i in seq_along(plots)) {
filename <- paste0("plot_", i, ".png")
ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}
# Switch back to the original working directory
setwd("..")
print("Individual plots are saved in the 'individual_country_plots' directory.")
[1] "Individual plots are saved in the 'individual_country_plots' directory."
ggplot(cleaned_data, aes(x = age, y = risktaking, color = country)) +
geom_point(size = 0.2) + # Adjust point size
geom_smooth(method = "lm", se = FALSE, size = 0.5, linetype = "solid") + # Adjust line size and type
labs(title = "Scatterplot with Regression Lines for risktaking and age by Country",
x = "Age", y = "Risktaking") +
theme(legend.position = "bottom", # Move the legend below the graph
legend.key.size = unit(0.1, "in")) # Adjust the size of the legend key
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
Please use `linewidth` instead.
# Calculate the overall regression line
overall_lm <- lm(risktaking ~ age, data = cleaned_data)
# Create a scatterplot with separate regression lines for each country
# and an overall regression line
ggplot(cleaned_data, aes(x = age, y = risktaking, color = country)) +
geom_point(size = 0.5) + # Adjust point size
geom_smooth(method = "lm", se = FALSE, size = 0.5) + # Solid line for individual countries
geom_abline(intercept = coef(overall_lm)[1], slope = coef(overall_lm)[2],
color = "black", size = 1) + # Add the overall regression line in red
labs(title = "Scatterplot with Regression Lines for risktaking and age by Country",
x = "Age", y = "Risktaking") +
theme(legend.position = "bottom", # Move the legend below the graph
legend.key.size = unit(0.1, "in")) # Adjust the size of the legend key
# Load the data
WV5_data <- readRDS("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/F00007944-WV5_Data_R_v20180912.rds") # Data of Wave 5
#rename the variables
WV5_data <- WV5_data_df %>%
rename(sex = V235, age = V237, country = V2, wave = V1, risktaking = V86)
WV5_data
#select only the variables of interest
WV5_data <- WV5_data %>%
select(sex, age, country, wave, risktaking)
WV5_data
#decode the country names
countrynames = read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt", header=FALSE,as.is=TRUE)
colnames(countrynames) = c("code", "name")
WV5_data$country_lab = countrynames$name [match(WV5_data$country, countrynames$code)]
table(WV5_data$country_lab)
Andorra Argentina Australia Brazil Bulgaria
1003 1002 1421 1500 1001
Burkina Faso Canada Chile China Colombia
1534 2164 1000 1991 3025
Cyprus (G) Egypt Ethiopia Finland France
1050 3051 1500 1014 1001
Georgia Germany Ghana Great Britain Guatemala
1500 2064 1534 1041 1000
Hong Kong Hungary India Indonesia Iran
1252 1007 2001 2015 2667
Iraq Italy Japan Jordan Malaysia
2701 1012 1096 1200 1201
Mali Mexico Moldova Morocco Netherlands
1534 1560 1046 1200 1050
New Zealand Norway Peru Poland Romania
954 1025 1500 1000 1776
Russia Rwanda Slovenia South Africa South Korea
2033 1507 1037 2988 1200
Spain Sweden Switzerland Taiwan Thailand
1200 1003 1241 1227 1534
Trinidad and Tobago Turkey Ukraine United States Uruguay
1002 1346 1000 1249 1000
Viet Nam Zambia
1495 1500
WV5_data
#Read Dataset (Wave 6)
WV6_data <- load("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/WV6_Data_R_v20201117.rdata")
WV6_data <- WV6_Data_R_v20201117
print(WV6_data)
#rename variables
WV6_data <- WV6_data %>%
rename(wave = V1, sex = V240, age = V242,country = V2, risktaking = V76)
#select only the variables of interest
WV6_data <- WV6_data %>%
select(wave, sex, age, country, sex,risktaking)
WV6_data
#decode daraset (Wave 6)
countrynames = read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt", header=FALSE,as.is=TRUE)
colnames(countrynames) = c("code", "name")
WV6_data$country_lab = countrynames$name [match(WV6_data$country, countrynames$code)]
table(WV6_data$country_lab)
Algeria Argentina Armenia Australia Azerbaijan
1200 1030 1100 1477 1002
Belarus Brazil Chile China Colombia
1535 1486 1000 2300 1512
Cyprus (G) Ecuador Egypt Estonia Georgia
1000 1202 1523 1533 1202
Germany Ghana Haiti Hong Kong India
2046 1552 1996 1000 4078
Iraq Japan Jordan Kazakhstan Kuwait
1200 2443 1200 1500 1303
Kyrgyzstan Lebanon Libya Malaysia Mexico
1500 1200 2131 1300 2000
Morocco Netherlands New Zealand Nigeria Pakistan
1200 1902 841 1759 1200
Palestine Peru Philippines Poland Qatar
1000 1210 1200 966 1060
Romania Russia Rwanda Singapore Slovenia
1503 2500 1527 1972 1069
South Africa South Korea Spain Sweden Taiwan
3531 1200 1189 1206 1238
Thailand Trinidad and Tobago Tunisia Turkey Ukraine
1200 999 1205 1605 1500
United States Uruguay Uzbekistan Yemen Zimbabwe
2232 1000 1500 1000 1500
WV6_data
cleaned_data <- cleaned_data %>%
rename(country_lab = country)
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(country)
# Now:
data %>% select(all_of(country))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
Error in `rename()`:
! Can't rename columns that don't exist.
✖ Column `Zimbabwe` doesn't exist.
Backtrace:
1. cleaned_data %>% rename(country_lab = country)
3. dplyr:::rename.data.frame(., country_lab = country)
#combine the 3 dataset (Wave 6 + Wave 5 + gps)
combined_data <- rbind(WV5_data, WV6_data, cleaned_data)
Error in rbind(deparse.level, ...) :
numbers of columns of arguments do not match
# extract the countries in each file
country_WV5 <- unique(WV5_data$country)
country_WV6 <- unique(WV6_data$country)
country_gps <- unique(cleaned_data$country)
# find common countries
common_country <- Reduce(intersect, list(country_WV5, country_WV6, country_gps))
# Print the common countries
print(common_country)